Classifying Users Based on EV Charging Patterns
import csv
import sqlite3
import pandas as pd

# Load the raw charging data, drop rows with any missing values, and persist
# the cleaned copy that the database-loading step below reads back in.
df = pd.read_csv('./ev_charging_patterns.csv')
df.dropna(inplace=True)
# index=False keeps the pandas row index from being written as a spurious
# unnamed column in the cleaned CSV.
df.to_csv('./ev_charging_patterns_null_removed.csv', index=False)

# Path to the cleaned CSV used by the loading step.
# (The original also opened a stray connection to a misspelled 'ev_chargingdb'
# file here; it was never used — the real connection to 'ev_charging.db' is
# opened in the next cell — so it has been removed.)
data_path = './ev_charging_patterns_null_removed.csv'
Step-1 Creating Normalized Database
# Connect to SQLite database (the file is created on first use).
conn = sqlite3.connect('ev_charging.db')
ev_cursor = conn.cursor()

# SQLite ignores FOREIGN KEY constraints unless enforcement is enabled on
# each connection; turn it on so the references declared below are checked.
ev_cursor.execute("PRAGMA foreign_keys = ON;")

# Drop existing tables (if any) so the schema can be rebuilt from scratch.
drop_queries = [
    "DROP TABLE IF EXISTS Users;",
    "DROP TABLE IF EXISTS ChargingStations;",
    "DROP TABLE IF EXISTS ChargingSessions;",
    "DROP TABLE IF EXISTS EnvironmentalData;"
]
for query in drop_queries:
    ev_cursor.execute(query)
conn.commit()

# Users: one row per EV owner (natural key from the CSV's 'User ID').
create_users_table = """
CREATE TABLE Users (
user_id VARCHAR(50) PRIMARY KEY,
user_type VARCHAR(50),
vehicle_model VARCHAR(50),
vehicle_age_years FLOAT
);
"""
ev_cursor.execute(create_users_table)
conn.commit()
print("Users table created successfully.")

# ChargingStations: one row per station.
create_station_table = """
CREATE TABLE ChargingStations (
station_id VARCHAR(50) PRIMARY KEY,
station_location VARCHAR(100),
charger_type VARCHAR(50)
);
"""
ev_cursor.execute(create_station_table)
conn.commit()
print("ChargingStations table created successfully.")

# ChargingSessions: one row per charging event, referencing user and station.
# session_id is a surrogate AUTOINCREMENT key used to link environmental data.
create_sessions_table = """
CREATE TABLE ChargingSessions (
session_id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id VARCHAR(50),
station_id VARCHAR(50),
start_time TIMESTAMP,
end_time TIMESTAMP,
duration_hours FLOAT,
energy_consumed_kwh FLOAT,
charging_cost_usd FLOAT,
charging_rate_kw FLOAT,
soc_start_percent FLOAT,
soc_end_percent FLOAT,
time_of_day VARCHAR(50),
day_of_week VARCHAR(50),
FOREIGN KEY(user_id) REFERENCES Users(user_id),
FOREIGN KEY(station_id) REFERENCES ChargingStations(station_id)
);
"""
ev_cursor.execute(create_sessions_table)
conn.commit()
print("ChargingSessions table created successfully.")

# EnvironmentalData: 1:1 with ChargingSessions (session_id is both PK and FK).
create_env_data_table = """
CREATE TABLE EnvironmentalData (
session_id INTEGER,
distance_driven_km FLOAT,
temperature_c FLOAT,
battery_capacity_kwh FLOAT,
PRIMARY KEY(session_id),
FOREIGN KEY(session_id) REFERENCES ChargingSessions(session_id)
);
"""
ev_cursor.execute(create_env_data_table)
conn.commit()
print("EnvironmentalData table created successfully.")

# Close connection
conn.close()
Users table created successfully.
ChargingStations table created successfully.
ChargingSessions table created successfully.
EnvironmentalData table created successfully.
import csv
import sqlite3

# Re-open the database for the bulk load.
conn = sqlite3.connect('ev_charging.db')
ev_cursor = conn.cursor()

# SQLite ignores FOREIGN KEY constraints unless enabled per connection.
ev_cursor.execute("PRAGMA foreign_keys = ON;")

# Cleaned CSV produced earlier in this notebook. Use the same relative path
# as the cleaning step instead of a machine-specific absolute path.
data_path = './ev_charging_patterns_null_removed.csv'

insert_user_query = """
INSERT OR IGNORE INTO Users (user_id, user_type, vehicle_model, vehicle_age_years)
VALUES (?, ?, ?, ?);
"""
insert_station_query = """
INSERT OR IGNORE INTO ChargingStations (station_id, station_location, charger_type)
VALUES (?, ?, ?);
"""
insert_sessions_query = """
INSERT INTO ChargingSessions (user_id, station_id, start_time, end_time, duration_hours,
energy_consumed_kwh, charging_cost_usd, charging_rate_kw,
soc_start_percent, soc_end_percent, time_of_day, day_of_week)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);
"""
insert_env_data_query = """
INSERT INTO EnvironmentalData (session_id, distance_driven_km, temperature_c, battery_capacity_kwh)
VALUES (?, ?, ?, ?);
"""

# Stream the CSV row by row. Each row is committed individually so a single
# bad row is rolled back and skipped without losing the rest of the load
# (slower than one commit at the end, but more forgiving of dirty data).
with open(data_path, 'r') as file:
    reader = csv.DictReader(file)
    for row in reader:
        try:
            # Users and stations repeat across sessions; INSERT OR IGNORE
            # keeps the first occurrence and silently skips duplicates.
            ev_cursor.execute(insert_user_query, (
                row['User ID'],
                row['User Type'],
                row['Vehicle Model'],
                float(row['Vehicle Age (years)'])
            ))
            ev_cursor.execute(insert_station_query, (
                row['Charging Station ID'],
                row['Charging Station Location'],
                row['Charger Type']
            ))
            # Insert session data
            ev_cursor.execute(insert_sessions_query, (
                row['User ID'],
                row['Charging Station ID'],
                row['Charging Start Time'],
                row['Charging End Time'],
                float(row['Charging Duration (hours)']),
                float(row['Energy Consumed (kWh)']),
                float(row['Charging Cost (USD)']),
                float(row['Charging Rate (kW)']),
                float(row['State of Charge (Start %)']),
                float(row['State of Charge (End %)']),
                row['Time of Day'],
                row['Day of Week']
            ))
            # The AUTOINCREMENT session id of the row just inserted links the
            # environmental record to its session.
            session_id = ev_cursor.lastrowid
            # NOTE(review): the temperature key below looks mojibake-encoded
            # ('ยฐ' where the degree sign should be) — confirm it matches the
            # actual CSV header, e.g. 'Temperature (°C)'.
            ev_cursor.execute(insert_env_data_query, (
                session_id,
                float(row['Distance Driven (since last charge) (km)']),
                float(row['Temperature (ยฐC)']),
                float(row['Battery Capacity (kWh)'])
            ))
            # Commit this row's four inserts as one unit.
            conn.commit()
        except sqlite3.IntegrityError as e:
            print(f"Integrity error for User ID {row['User ID']}: {e}")
            conn.rollback()
        except Exception as e:
            print(f"Error for User ID {row['User ID']}: {e}")
            conn.rollback()

# Close connection
conn.close()
import pandas as pd
import sqlite3

# Re-open the database and pull one denormalized view of all four tables:
# each session joined to its user, its station, and its environmental record.
conn = sqlite3.connect('ev_charging.db')

join_query = """
SELECT
    u.user_id,
    u.user_type,
    u.vehicle_model,
    u.vehicle_age_years,
    cs.station_id,
    cs.station_location,
    cs.charger_type,
    s.start_time,
    s.end_time,
    s.duration_hours,
    s.energy_consumed_kwh,
    s.charging_cost_usd,
    s.charging_rate_kw,
    s.soc_start_percent,
    s.soc_end_percent,
    s.time_of_day,
    s.day_of_week,
    e.distance_driven_km,
    e.temperature_c,
    e.battery_capacity_kwh
FROM Users u
JOIN ChargingSessions s ON u.user_id = s.user_id
JOIN ChargingStations cs ON s.station_id = cs.station_id
JOIN EnvironmentalData e ON s.session_id = e.session_id;
"""

# Load the join result straight into a DataFrame; the connection is closed
# even if the query raises.
try:
    df = pd.read_sql_query(join_query, conn)
finally:
    conn.close()

# Display the combined data.
print(df)
user_id user_type vehicle_model vehicle_age_years \
0 User_1 Commuter BMW i3 2.0
1 User_2 Casual Driver Hyundai Kona 3.0
2 User_3 Commuter Chevy Bolt 2.0
3 User_4 Long-Distance Traveler Hyundai Kona 1.0
4 User_5 Long-Distance Traveler Hyundai Kona 1.0
... ... ... ... ...
1126 User_1316 Commuter Nissan Leaf 7.0
1127 User_1317 Casual Driver BMW i3 4.0
1128 User_1318 Commuter Nissan Leaf 5.0
1129 User_1319 Commuter Chevy Bolt 5.0
1130 User_1320 Commuter Nissan Leaf 5.0
station_id station_location charger_type start_time \
0 Station_391 Houston DC Fast Charger 2024-01-01 00:00:00
1 Station_428 San Francisco Level 1 2024-01-01 01:00:00
2 Station_181 San Francisco Level 2 2024-01-01 02:00:00
3 Station_327 Houston Level 1 2024-01-01 03:00:00
4 Station_108 Los Angeles Level 1 2024-01-01 04:00:00
... ... ... ... ...
1126 Station_57 New York Level 2 2024-02-24 19:00:00
1127 Station_40 New York Level 1 2024-02-24 20:00:00
1128 Station_374 New York DC Fast Charger 2024-02-24 21:00:00
1129 Station_336 Chicago DC Fast Charger 2024-02-24 22:00:00
1130 Station_128 San Francisco Level 1 2024-02-24 23:00:00
end_time duration_hours energy_consumed_kwh \
0 2024-01-01 00:39:00 0.591363 60.712346
1 2024-01-01 03:01:00 3.133652 12.339275
2 2024-01-01 04:48:00 2.452653 19.128876
3 2024-01-01 06:42:00 1.266431 79.457824
4 2024-01-01 05:46:00 2.019765 19.629104
... ... ... ...
1126 2024-02-24 20:30:00 1.426444 42.011654
1127 2024-02-24 20:44:00 3.238212 68.185853
1128 2024-02-24 23:03:00 3.267122 18.895102
1129 2024-02-24 23:20:00 2.754527 13.756252
1130 2024-02-24 23:56:00 3.740970 63.652570
charging_cost_usd charging_rate_kw soc_start_percent soc_end_percent \
0 13.087717 36.389181 29.371576 86.119962
1 21.128448 30.677735 10.115778 84.664344
2 35.667270 27.513593 6.854604 69.917615
3 13.036239 32.882870 83.120003 99.624328
4 10.161471 10.215712 54.258950 63.743786
... ... ... ... ...
1126 22.081164 5.895475 39.204102 83.915952
1127 5.067806 18.388012 31.456375 93.096461
1128 37.255002 45.482066 71.903081 78.678879
1129 39.046146 38.148183 76.187997 65.926573
1130 10.863674 33.704226 59.338076 56.692439
time_of_day day_of_week distance_driven_km temperature_c \
0 Evening Tuesday 293.602111 27.947953
1 Morning Monday 112.112804 14.311026
2 Morning Thursday 71.799253 21.002002
3 Evening Saturday 199.577785 38.316313
4 Morning Saturday 203.661847 -7.834199
... ... ... ... ...
1126 Evening Sunday 239.601075 1.919655
1127 Evening Tuesday 164.376022 34.029775
1128 Evening Tuesday 226.519258 20.358761
1129 Afternoon Sunday 291.494076 24.134598
1130 Evening Monday 14.449236 -6.966593
battery_capacity_kwh
0 108.463007
1 100.000000
2 75.000000
3 50.000000
4 50.000000
... ...
1126 100.000000
1127 100.000000
1128 100.000000
1129 85.000000
1130 120.447195
[1131 rows x 20 columns]
Verifying the data retrieved from the database
# Quick visual check of the first rows retrieved from the database.
df.head()
| user_id | user_type | vehicle_model | vehicle_age_years | station_id | station_location | charger_type | start_time | end_time | duration_hours | energy_consumed_kwh | charging_cost_usd | charging_rate_kw | soc_start_percent | soc_end_percent | time_of_day | day_of_week | distance_driven_km | temperature_c | battery_capacity_kwh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | User_1 | Commuter | BMW i3 | 2.0 | Station_391 | Houston | DC Fast Charger | 2024-01-01 00:00:00 | 2024-01-01 00:39:00 | 0.591363 | 60.712346 | 13.087717 | 36.389181 | 29.371576 | 86.119962 | Evening | Tuesday | 293.602111 | 27.947953 | 108.463007 |
| 1 | User_2 | Casual Driver | Hyundai Kona | 3.0 | Station_428 | San Francisco | Level 1 | 2024-01-01 01:00:00 | 2024-01-01 03:01:00 | 3.133652 | 12.339275 | 21.128448 | 30.677735 | 10.115778 | 84.664344 | Morning | Monday | 112.112804 | 14.311026 | 100.000000 |
| 2 | User_3 | Commuter | Chevy Bolt | 2.0 | Station_181 | San Francisco | Level 2 | 2024-01-01 02:00:00 | 2024-01-01 04:48:00 | 2.452653 | 19.128876 | 35.667270 | 27.513593 | 6.854604 | 69.917615 | Morning | Thursday | 71.799253 | 21.002002 | 75.000000 |
| 3 | User_4 | Long-Distance Traveler | Hyundai Kona | 1.0 | Station_327 | Houston | Level 1 | 2024-01-01 03:00:00 | 2024-01-01 06:42:00 | 1.266431 | 79.457824 | 13.036239 | 32.882870 | 83.120003 | 99.624328 | Evening | Saturday | 199.577785 | 38.316313 | 50.000000 |
| 4 | User_5 | Long-Distance Traveler | Hyundai Kona | 1.0 | Station_108 | Los Angeles | Level 1 | 2024-01-01 04:00:00 | 2024-01-01 05:46:00 | 2.019765 | 19.629104 | 10.161471 | 10.215712 | 54.258950 | 63.743786 | Morning | Saturday | 203.661847 | -7.834199 | 50.000000 |
# Defensively drop any rows with nulls (the source CSV was already cleaned),
# then confirm that no column has missing values left.
df.dropna(inplace=True)
df.isna().sum()
user_id 0
user_type 0
vehicle_model 0
vehicle_age_years 0
station_id 0
station_location 0
charger_type 0
start_time 0
end_time 0
duration_hours 0
energy_consumed_kwh 0
charging_cost_usd 0
charging_rate_kw 0
soc_start_percent 0
soc_end_percent 0
time_of_day 0
day_of_week 0
distance_driven_km 0
temperature_c 0
battery_capacity_kwh 0
dtype: int64
%pip install scikit-learn
%pip install seaborn
Requirement already satisfied: scikit-learn in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (1.6.0)
Requirement already satisfied: numpy>=1.19.5 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (2.0.2)
Requirement already satisfied: scipy>=1.6.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (1.13.1)
Requirement already satisfied: joblib>=1.2.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (1.4.2)
Requirement already satisfied: threadpoolctl>=3.1.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from scikit-learn) (3.5.0)
Note: you may need to restart the kernel to use updated packages.
Requirement already satisfied: seaborn in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (0.13.2)
Requirement already satisfied: numpy!=1.24.0,>=1.20 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from seaborn) (2.0.2)
Requirement already satisfied: pandas>=1.2 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from seaborn) (2.2.3)
Requirement already satisfied: matplotlib!=3.6.1,>=3.4 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from seaborn) (3.9.2)
Requirement already satisfied: contourpy>=1.0.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.3.0)
Requirement already satisfied: cycler>=0.10 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (4.54.1)
Requirement already satisfied: kiwisolver>=1.3.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (1.4.7)
Requirement already satisfied: packaging>=20.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (24.1)
Requirement already satisfied: pillow>=8 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (10.4.0)
Requirement already satisfied: pyparsing>=2.3.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (3.1.4)
Requirement already satisfied: python-dateutil>=2.7 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from matplotlib!=3.6.1,>=3.4->seaborn) (2.9.0.post0)
Requirement already satisfied: pytz>=2020.1 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pandas>=1.2->seaborn) (2024.2)
Requirement already satisfied: tzdata>=2022.7 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from pandas>=1.2->seaborn) (2024.2)
Requirement already satisfied: six>=1.5 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from python-dateutil>=2.7->matplotlib!=3.6.1,>=3.4->seaborn) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Feature matrix / target vector split.
# NOTE(review): X still contains identifier and timestamp columns (user_id,
# station_id, start_time, end_time); one-hot encoding them later produces
# near-unique features — consider dropping them before modeling.
X = df.drop(columns=['user_type'])
y = df['user_type']

# Hold out 20% for testing. Stratify on the target so class proportions are
# preserved in both splits (the original passed stratify=None, which defeats
# the stratification the next cell goes on to verify).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
Stratifying the data/ target variable
# Sanity-check that a stratified split preserves the target's class balance:
# compare the full-dataset distribution against the train and test splits.
baseline_dist = df['user_type'].value_counts(normalize=True)
print("Original dataset 'User Type' distribution:")
print(baseline_dist)

train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['user_type'], random_state=42
)

print("\nTrain set 'User Type' distribution:")
print(train_df['user_type'].value_counts(normalize=True))
print("\nTest set 'User Type' distribution:")
print(test_df['user_type'].value_counts(normalize=True))
Original dataset 'User Type' distribution:
user_type
Commuter 0.357206
Long-Distance Traveler 0.336870
Casual Driver 0.305924
Name: proportion, dtype: float64
Train set 'User Type' distribution:
user_type
Commuter 0.357301
Long-Distance Traveler 0.336283
Casual Driver 0.306416
Name: proportion, dtype: float64
Test set 'User Type' distribution:
user_type
Commuter 0.356828
Long-Distance Traveler 0.339207
Casual Driver 0.303965
Name: proportion, dtype: float64
pip install ipywidgets
Requirement already satisfied: ipywidgets in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (8.1.5)
Requirement already satisfied: comm>=0.1.3 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipywidgets) (0.2.2)
Requirement already satisfied: ipython>=6.1.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipywidgets) (8.26.0)
Requirement already satisfied: traitlets>=4.3.1 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipywidgets) (5.14.3)
Requirement already satisfied: widgetsnbextension~=4.0.12 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from ipywidgets) (4.0.13)
Requirement already satisfied: jupyterlab-widgets~=3.0.12 in /Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/site-packages (from ipywidgets) (3.0.13)
Requirement already satisfied: decorator in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (5.1.1)
Requirement already satisfied: jedi>=0.16 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (0.19.1)
Requirement already satisfied: matplotlib-inline in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (0.1.7)
Requirement already satisfied: prompt-toolkit<3.1.0,>=3.0.41 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (3.0.47)
Requirement already satisfied: pygments>=2.4.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (2.18.0)
Requirement already satisfied: stack-data in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (0.6.3)
Requirement already satisfied: pexpect>4.3 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from ipython>=6.1.0->ipywidgets) (4.9.0)
Requirement already satisfied: parso<0.9.0,>=0.8.3 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from jedi>=0.16->ipython>=6.1.0->ipywidgets) (0.8.4)
Requirement already satisfied: ptyprocess>=0.5 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from pexpect>4.3->ipython>=6.1.0->ipywidgets) (0.7.0)
Requirement already satisfied: wcwidth in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from prompt-toolkit<3.1.0,>=3.0.41->ipython>=6.1.0->ipywidgets) (0.2.13)
Requirement already satisfied: executing>=1.2.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.0.1)
Requirement already satisfied: asttokens>=2.1.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (2.4.1)
Requirement already satisfied: pure-eval in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from stack-data->ipython>=6.1.0->ipywidgets) (0.2.3)
Requirement already satisfied: six>=1.12.0 in /Users/saisatvikhlakkimsetty/Library/Python/3.12/lib/python/site-packages (from asttokens>=2.1.0->stack-data->ipython>=6.1.0->ipywidgets) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
Exploratory Data Analysis
# Automated EDA: build a ydata-profiling report over the combined DataFrame,
# save it as a standalone HTML file, and render it inline in the notebook.
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Data Profile Report", explorative=True)
profile.to_file("ev_charging_patterns_profile_report.html")
profile.to_notebook_iframe()
Observations:
Missing Values: The dataset contains 66 missing values, which account for approximately 0.3% of the data. Missing values are present in the following columns: "Energy Consumed (kWh)", "Charging Rate (kW)", and "Distance Driven (since last charge) (km)".
Duplicates: A thorough check reveals that there are no duplicate rows in the dataset, indicating data uniqueness.
Categorical Variables: The dataset includes the following categorical variables: "User Type": represents the type of user, which is our target variable; "Vehicle Model": indicates the vehicle model being used; "Charger Type": specifies the type of charger employed during the session; "Charging Station Location": describes the location of the charging station. All these variables have low cardinality (each contains between 3 and 5 unique values), making them manageable for encoding or analysis.
Numerical Variables: Some inconsistencies and unusual patterns were identified. "State of Charge (Start %)" and "State of Charge (End %)": both columns have values exceeding 100%, which is logically inconsistent and requires correction or capping.
"Temperature (°C)": The recorded temperatures range from -10.72°C to 73.17°C, suggesting potential outliers that need further investigation.
Strong correlations observed: "Energy Consumed (kWh)" and "Charging Duration (hours)" have a strong positive correlation (0.95). "Battery Capacity (kWh)" and "Charging Rate (kW)" show a moderate positive correlation (0.68).
Date/Time Variables
The dataset includes two datetime variables: "Charging Start Time" and "Charging End Time". These variables can be leveraged to derive new features like charging duration, time of day, or day of the week.
Distributions
Some numerical features show distinct patterns: "Charging Duration (hours)" and "Charging Cost (USD)" both exhibit right-skewed distributions, indicating a few sessions with unusually high values.
# Re-inspect the first rows after profiling.
df.head()
| user_id | user_type | vehicle_model | vehicle_age_years | station_id | station_location | charger_type | start_time | end_time | duration_hours | energy_consumed_kwh | charging_cost_usd | charging_rate_kw | soc_start_percent | soc_end_percent | time_of_day | day_of_week | distance_driven_km | temperature_c | battery_capacity_kwh | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | User_1 | Commuter | BMW i3 | 2.0 | Station_391 | Houston | DC Fast Charger | 2024-01-01 00:00:00 | 2024-01-01 00:39:00 | 0.591363 | 60.712346 | 13.087717 | 36.389181 | 29.371576 | 86.119962 | Evening | Tuesday | 293.602111 | 27.947953 | 108.463007 |
| 1 | User_2 | Casual Driver | Hyundai Kona | 3.0 | Station_428 | San Francisco | Level 1 | 2024-01-01 01:00:00 | 2024-01-01 03:01:00 | 3.133652 | 12.339275 | 21.128448 | 30.677735 | 10.115778 | 84.664344 | Morning | Monday | 112.112804 | 14.311026 | 100.000000 |
| 2 | User_3 | Commuter | Chevy Bolt | 2.0 | Station_181 | San Francisco | Level 2 | 2024-01-01 02:00:00 | 2024-01-01 04:48:00 | 2.452653 | 19.128876 | 35.667270 | 27.513593 | 6.854604 | 69.917615 | Morning | Thursday | 71.799253 | 21.002002 | 75.000000 |
| 3 | User_4 | Long-Distance Traveler | Hyundai Kona | 1.0 | Station_327 | Houston | Level 1 | 2024-01-01 03:00:00 | 2024-01-01 06:42:00 | 1.266431 | 79.457824 | 13.036239 | 32.882870 | 83.120003 | 99.624328 | Evening | Saturday | 199.577785 | 38.316313 | 50.000000 |
| 4 | User_5 | Long-Distance Traveler | Hyundai Kona | 1.0 | Station_108 | Los Angeles | Level 1 | 2024-01-01 04:00:00 | 2024-01-01 05:46:00 | 2.019765 | 19.629104 | 10.161471 | 10.215712 | 54.258950 | 63.743786 | Morning | Saturday | 203.661847 | -7.834199 | 50.000000 |
# Column dtypes and non-null counts for the combined DataFrame.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1131 entries, 0 to 1130
Data columns (total 20 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 user_id 1131 non-null object
1 user_type 1131 non-null object
2 vehicle_model 1131 non-null object
3 vehicle_age_years 1131 non-null float64
4 station_id 1131 non-null object
5 station_location 1131 non-null object
6 charger_type 1131 non-null object
7 start_time 1131 non-null object
8 end_time 1131 non-null object
9 duration_hours 1131 non-null float64
10 energy_consumed_kwh 1131 non-null float64
11 charging_cost_usd 1131 non-null float64
12 charging_rate_kw 1131 non-null float64
13 soc_start_percent 1131 non-null float64
14 soc_end_percent 1131 non-null float64
15 time_of_day 1131 non-null object
16 day_of_week 1131 non-null object
17 distance_driven_km 1131 non-null float64
18 temperature_c 1131 non-null float64
19 battery_capacity_kwh 1131 non-null float64
dtypes: float64(10), object(10)
memory usage: 176.8+ KB
# Summary statistics for the numeric columns.
df.describe()
| vehicle_age_years | duration_hours | energy_consumed_kwh | charging_cost_usd | charging_rate_kw | soc_start_percent | soc_end_percent | distance_driven_km | temperature_c | battery_capacity_kwh | |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 | 1131.000000 |
| mean | 3.604227 | 2.303177 | 42.915668 | 22.488351 | 26.014166 | 49.230036 | 75.012917 | 153.663101 | 15.305780 | 74.427818 |
| std | 2.324090 | 1.065878 | 22.201286 | 10.792504 | 14.010292 | 24.170435 | 16.920463 | 85.549751 | 14.751266 | 20.828350 |
| min | 0.000000 | 0.095314 | 0.045772 | 0.307085 | 1.472549 | 2.325959 | 7.604224 | 1.899538 | -10.724770 | 1.536540 |
| 25% | 2.000000 | 1.425281 | 24.248936 | 13.133925 | 13.949809 | 27.661992 | 62.264460 | 80.954993 | 3.009498 | 62.000000 |
| 50% | 4.000000 | 2.312675 | 42.865611 | 21.828088 | 25.838488 | 48.947886 | 75.100944 | 152.257515 | 14.641853 | 75.000000 |
| 75% | 6.000000 | 3.145998 | 61.544055 | 31.675804 | 37.508677 | 69.783816 | 88.245070 | 225.469628 | 27.824244 | 85.000000 |
| max | 11.688592 | 7.635145 | 152.238758 | 69.407743 | 97.342255 | 125.087227 | 177.708666 | 398.364775 | 73.169588 | 193.003074 |
# Per-column dtypes (object vs float64).
df.dtypes
user_id object
user_type object
vehicle_model object
vehicle_age_years float64
station_id object
station_location object
charger_type object
start_time object
end_time object
duration_hours float64
energy_consumed_kwh float64
charging_cost_usd float64
charging_rate_kw float64
soc_start_percent float64
soc_end_percent float64
time_of_day object
day_of_week object
distance_driven_km float64
temperature_c float64
battery_capacity_kwh float64
dtype: object
# Count fully duplicated rows (expected 0 per the profiling report).
df.duplicated().sum()
np.int64(0)
# Re-verify that no column has missing values.
df.isna().sum()
user_id 0
user_type 0
vehicle_model 0
vehicle_age_years 0
station_id 0
station_location 0
charger_type 0
start_time 0
end_time 0
duration_hours 0
energy_consumed_kwh 0
charging_cost_usd 0
charging_rate_kw 0
soc_start_percent 0
soc_end_percent 0
time_of_day 0
day_of_week 0
distance_driven_km 0
temperature_c 0
battery_capacity_kwh 0
dtype: int64
# Drop rows missing key modeling columns. This is a no-op here (the isna
# check above showed zero nulls) and is kept only as a safeguard.
df.dropna(subset=['vehicle_model','duration_hours','end_time'], inplace=True)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

# Pie chart of the user-type class balance (the classification target).
plt.figure(figsize=(8, 5))
# The axes object returned by .plot.pie was previously bound to an unused
# local (pie_plot); the assignment has been dropped.
df['user_type'].value_counts().plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette('Set2'),
    fontsize=12
)
plt.title('Distribution of User Types', fontsize=16)
plt.show()
%pip install -q dagshub mlflow
Note: you may need to restart the kernel to use updated packages.
Experiment 1
# Point MLflow experiment tracking at the DagsHub repository
# (requires DagsHub authentication in the running environment).
import dagshub
dagshub.init(repo_owner='saisatvikh', repo_name='final_repo', mlflow=True)
Accessing as saisatvikh
Initialized MLflow to track repo "saisatvikh/final_repo"
Repository saisatvikh/final_repo initialized!
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# (Duplicate imports of OneHotEncoder, ColumnTransformer, StratifiedKFold and
# f1_score in the original cell have been merged above.)

# Ensure categorical columns are uniform strings before encoding.
categorical_features = X.select_dtypes(include=['object']).columns
for col in categorical_features:
    X[col] = X[col].astype(str)
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns

# NOTE(review): categorical_features includes user_id, station_id, start_time
# and end_time; one-hot encoding them yields near-unique columns and likely
# depresses the F1 scores — consider excluding them from the feature set.

# Numeric columns: mean-impute, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])
# Categorical columns: mode-impute, then one-hot encode (unknown-safe).
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear'))
])

# Stratify on y for consistency with the stratification analysis above
# (the original split here was unstratified).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# 10-fold cross-validation on the training split, scored by macro-F1.
cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=10, scoring='f1_macro')
print(f"Cross-validation F1-score (mean): {cv_results.mean():.4f}")
print(f"Cross-validation F1-score (std): {cv_results.std():.4f}")
Cross-validation F1-score (mean): 0.3656
Cross-validation F1-score (std): 0.0559
# Fit the full preprocessing + logistic-regression pipeline on the training split.
model_pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])),
('classifier', LogisticRegression(solver='liblinear'))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])),
('classifier', LogisticRegression(solver='liblinear'))])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')SimpleImputer()
StandardScaler()
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object')SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
LogisticRegression(solver='liblinear')
# Score the held-out test set with macro-averaged F1.
y_pred = model_pipeline.predict(X_test)
logistic_f1_score = f1_score(y_test, y_pred, average='macro')
print(f"F1-score on test data: {logistic_f1_score:.4f}")

# Per-class one-vs-rest counts from the confusion matrix.
# BUG FIX: the original swapped tn and fn — row-sum minus the diagonal is
# the false negatives for each class, and the residual (total minus
# TP+FP+FN) is the true negatives, not the other way around.
cm = confusion_matrix(y_test, y_pred)
tp = cm.diagonal()                 # correct predictions per class
fn = cm.sum(axis=1) - tp           # actual class, predicted as something else
fp = cm.sum(axis=0) - tp           # predicted class, actually something else
tn = cm.sum() - (tp + fp + fn)     # everything else
import mlflow
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np

with mlflow.start_run(nested=True):
    # Identify the estimator/scaler combination for this run.
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_param("scaler", "StandardScaler")

    # 3-fold cross-validated macro-F1 on the training split (mean and spread).
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='f1_macro')
    mlflow.log_metric("cv_f1_mean", cv_results.mean())
    mlflow.log_metric("cv_f1_std", cv_results.std())

    # Refit on the full training split, then score the held-out test split.
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='macro')
    mlflow.log_metric("f1_test", f1)

    # One-vs-rest confusion counts for a single chosen class.
    cm = confusion_matrix(y_test, y_pred)
    class_index = 0  # change to inspect a different class
    tp = cm[class_index, class_index]
    fn = cm[class_index].sum() - tp      # row sum minus diagonal
    fp = cm[:, class_index].sum() - tp   # column sum minus diagonal
    tn = cm.sum() - (tp + fn + fp)       # remainder of the matrix
    mlflow.log_metric(f"TP_class_{class_index}", tp)
    mlflow.log_metric(f"TN_class_{class_index}", tn)
    mlflow.log_metric(f"FP_class_{class_index}", fp)
    mlflow.log_metric(f"FN_class_{class_index}", fn)

    # Persist the fitted pipeline with the run.
    mlflow.sklearn.log_model(model_pipeline, "logistic_regression_model")

    print(f"Logged F1 score to MLFlow: {f1:.4f}")
    print(f"Logged CV results (mean): {cv_results.mean():.4f}")
    print(f"Logged CV results (std): {cv_results.std():.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"True Positives (Class {class_index}): {tp}")
    print(f"True Negatives (Class {class_index}): {tn}")
    print(f"False Positives (Class {class_index}): {fp}")
    print(f"False Negatives (Class {class_index}): {fn}")
2024/12/21 09:57:21 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Logged F1 score to MLFlow: 0.3007
Logged CV results (mean): 0.3572
Logged CV results (std): 0.0186
Confusion Matrix:
[[18 47 45]
[25 51 32]
[32 53 37]]
True Positives (Class 0): 18
True Negatives (Class 0): 173
False Positives (Class 0): 57
False Negatives (Class 0): 92
๐ View run capable-gull-50 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/87d5386ec89140e9acfa665964672f2f
๐งช View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Ridge Classifier
# Reuse the shared preprocessor, swapping in a ridge (L2-regularised linear) classifier.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RidgeClassifier()),
])
model_pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])),
('classifier', RidgeClassifier())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])),
('classifier', RidgeClassifier())])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')SimpleImputer()
StandardScaler()
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object')SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
RidgeClassifier()
# Score the fitted ridge-classifier pipeline on the held-out test split.
y_pred = model_pipeline.predict(X_test)
# Macro-average F1 weights every class equally regardless of its support.
ridge_f1_score= f1_score(y_test, y_pred,average='macro')
print(f"F1-score on test data: {ridge_f1_score:.4f}")
F1-score on test data: 0.3370
import mlflow
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np

with mlflow.start_run(nested=True):
    # Identify the estimator/scaler combination for this run.
    mlflow.log_param("model", "Ridge Classifier")
    mlflow.log_param("scaler", "StandardScaler")

    # 3-fold cross-validated macro-F1 on the training split.
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='f1_macro')
    mlflow.log_metric("cv_f1_mean", cv_results.mean())
    mlflow.log_metric("cv_f1_std", cv_results.std())

    # Refit and score the held-out test split.
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='macro')
    mlflow.log_metric("f1_test", f1)

    # One-vs-rest confusion counts for a single chosen class.
    cm = confusion_matrix(y_test, y_pred)
    class_index = 0
    tp = cm[class_index, class_index]
    fn = cm[class_index].sum() - tp
    fp = cm[:, class_index].sum() - tp
    tn = cm.sum() - (tp + fn + fp)
    mlflow.log_metric(f"TP_class_{class_index}", tp)
    mlflow.log_metric(f"TN_class_{class_index}", tn)
    mlflow.log_metric(f"FP_class_{class_index}", fp)
    mlflow.log_metric(f"FN_class_{class_index}", fn)

    # Persist the fitted pipeline with the run.
    mlflow.sklearn.log_model(model_pipeline, "ridge_model")

    print(f"Logged F1 score to MLFlow: {f1:.4f}")
    print(f"Logged CV results (mean): {cv_results.mean():.4f}")
    print(f"Logged CV results (std): {cv_results.std():.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"True Positives (Class {class_index}): {tp}")
    print(f"True Negatives (Class {class_index}): {tn}")
    print(f"False Positives (Class {class_index}): {fp}")
    print(f"False Negatives (Class {class_index}): {fn}")
2024/12/21 09:57:38 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Logged F1 score to MLFlow: 0.3370
Logged CV results (mean): 0.3621
Logged CV results (std): 0.0105
Confusion Matrix:
[[25 48 37]
[25 53 30]
[35 48 39]]
True Positives (Class 0): 25
True Negatives (Class 0): 170
False Positives (Class 0): 60
False Negatives (Class 0): 85
๐ View run invincible-mule-617 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/db7828508a1e4941a87e64315d9b7489
๐งช View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Random Forest Classifier
# Reuse the shared preprocessor, this time with a random-forest classifier.
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier()),
])
model_pipeline.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])),
('classifier', RandomForestClassifier())])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('preprocessor',
ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer()),
('scaler',
StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])),
('classifier', RandomForestClassifier())])ColumnTransformer(transformers=[('num',
Pipeline(steps=[('imputer', SimpleImputer()),
('scaler', StandardScaler())]),
Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')),
('cat',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='most_frequent')),
('onehot',
OneHotEncoder(handle_unknown='ignore'))]),
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object'))])Index(['vehicle_age_years', 'duration_hours', 'energy_consumed_kwh',
'charging_cost_usd', 'charging_rate_kw', 'soc_start_percent',
'soc_end_percent', 'distance_driven_km', 'temperature_c',
'battery_capacity_kwh'],
dtype='object')SimpleImputer()
StandardScaler()
Index(['user_id', 'vehicle_model', 'station_id', 'station_location',
'charger_type', 'start_time', 'end_time', 'time_of_day', 'day_of_week'],
dtype='object')SimpleImputer(strategy='most_frequent')
OneHotEncoder(handle_unknown='ignore')
RandomForestClassifier()
# Score the fitted random-forest pipeline on the held-out test split.
y_pred = model_pipeline.predict(X_test)
# Macro-average F1 weights every class equally regardless of its support.
random_forest_f1= f1_score(y_test, y_pred,average='macro')
print(f"F1-score on test data: {random_forest_f1:.4f}")
F1-score on test data: 0.2592
import mlflow
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np

with mlflow.start_run(nested=True):
    # Identify the estimator/scaler combination for this run.
    mlflow.log_param("model", "Random Forest Classifier")
    mlflow.log_param("scaler", "StandardScaler")

    # 3-fold cross-validated macro-F1 on the training split.
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='f1_macro')
    mlflow.log_metric("cv_f1_mean", cv_results.mean())
    mlflow.log_metric("cv_f1_std", cv_results.std())

    # Refit and score the held-out test split.
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='macro')
    mlflow.log_metric("f1_test", f1)

    # One-vs-rest confusion counts for a single chosen class.
    cm = confusion_matrix(y_test, y_pred)
    class_index = 0  # change to inspect a different class
    tp = cm[class_index, class_index]
    fn = cm[class_index].sum() - tp
    fp = cm[:, class_index].sum() - tp
    tn = cm.sum() - (tp + fn + fp)
    mlflow.log_metric(f"TP_class_{class_index}", tp)
    mlflow.log_metric(f"TN_class_{class_index}", tn)
    mlflow.log_metric(f"FP_class_{class_index}", fp)
    mlflow.log_metric(f"FN_class_{class_index}", fn)

    # Persist the fitted pipeline with the run.
    mlflow.sklearn.log_model(model_pipeline, "random_forest_model")

    print(f"Logged F1 score to MLFlow: {f1:.4f}")
    print(f"Logged CV results (mean): {cv_results.mean():.4f}")
    print(f"Logged CV results (std): {cv_results.std():.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"True Positives (Class {class_index}): {tp}")
    print(f"True Negatives (Class {class_index}): {tn}")
    print(f"False Positives (Class {class_index}): {fp}")
    print(f"False Negatives (Class {class_index}): {fn}")
2024/12/21 09:57:57 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Logged F1 score to MLFlow: 0.2980
Logged CV results (mean): 0.3316
Logged CV results (std): 0.0108
Confusion Matrix:
[[11 68 31]
[19 69 20]
[13 76 33]]
True Positives (Class 0): 11
True Negatives (Class 0): 198
False Positives (Class 0): 32
False Negatives (Class 0): 99
๐ View run nervous-smelt-320 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/b5a02bb70294472fa04567dc71161ded
๐งช View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Polynomial Features
import mlflow
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
# Numeric columns selected for polynomial expansion.
numeric_features = [
    'battery_capacity_kwh', 'charging_cost_usd',
    'duration_hours', 'soc_end_percent', 'soc_start_percent',
    'temperature_c', 'vehicle_age_years'
]

def add_polynomial_features(df, numeric_features, degree=2):
    """Append degree-`degree` polynomial and interaction terms of the numeric columns.

    Returns a new DataFrame: the original columns followed by the expanded terms.
    NOTE(review): PolynomialFeatures re-emits the degree-1 columns, so the concat
    duplicates each numeric column — confirm downstream code tolerates that.
    """
    base = df[numeric_features]
    expander = PolynomialFeatures(degree=degree, include_bias=False)
    expanded = expander.fit_transform(base)
    term_names = expander.get_feature_names_out(numeric_features)
    expanded_frame = pd.DataFrame(expanded, columns=term_names, index=df.index)
    return pd.concat([df, expanded_frame], axis=1)
# Build the expanded dataset and record the experiment in the active MLflow run.
df_with_poly_features = add_polynomial_features(df, numeric_features, degree=2)
mlflow.log_param("polynomial_degree", 2)
mlflow.log_param("numeric_features", numeric_features)
# Column-count delta = number of generated polynomial/interaction terms.
mlflow.log_metric("num_poly_features", len(df_with_poly_features.columns) - len(df.columns))
# Persist the expanded dataset and attach it to the run as an artifact.
df_with_poly_features.to_csv("polynomial_features_dataset.csv", index=False)
mlflow.log_artifact("polynomial_features_dataset.csv")
print(df_with_poly_features.head())
print("Feature engineering results logged in MLflow.")
user_id user_type vehicle_model vehicle_age_years \
0 User_1 Commuter BMW i3 2.0
1 User_2 Casual Driver Hyundai Kona 3.0
2 User_3 Commuter Chevy Bolt 2.0
3 User_4 Long-Distance Traveler Hyundai Kona 1.0
4 User_5 Long-Distance Traveler Hyundai Kona 1.0
station_id station_location charger_type start_time \
0 Station_391 Houston DC Fast Charger 2024-01-01 00:00:00
1 Station_428 San Francisco Level 1 2024-01-01 01:00:00
2 Station_181 San Francisco Level 2 2024-01-01 02:00:00
3 Station_327 Houston Level 1 2024-01-01 03:00:00
4 Station_108 Los Angeles Level 1 2024-01-01 04:00:00
end_time duration_hours ... soc_end_percent^2 \
0 2024-01-01 00:39:00 0.591363 ... 7416.647931
1 2024-01-01 03:01:00 3.133652 ... 7168.051082
2 2024-01-01 04:48:00 2.452653 ... 4888.472921
3 2024-01-01 06:42:00 1.266431 ... 9925.006656
4 2024-01-01 05:46:00 2.019765 ... 4063.270258
soc_end_percent soc_start_percent soc_end_percent temperature_c \
0 2529.479020 2406.876668
1 856.445674 1211.633594
2 479.257596 1468.409884
3 8280.774408 3817.236936
4 3458.670880 -499.381507
soc_end_percent vehicle_age_years soc_start_percent^2 \
0 172.239925 862.689475
1 253.993031 102.328957
2 139.835230 46.985602
3 99.624328 6908.934892
4 63.743786 2944.033622
soc_start_percent temperature_c soc_start_percent vehicle_age_years \
0 820.875427 58.743152
1 144.767153 30.347333
2 143.960415 13.709209
3 3184.852063 83.120003
4 -425.075411 54.258950
temperature_c^2 temperature_c vehicle_age_years vehicle_age_years^2
0 781.088080 55.895906 4.0
1 204.805455 42.933077 9.0
2 441.084081 42.004004 4.0
3 1468.139854 38.316313 1.0
4 61.374674 -7.834199 1.0
[5 rows x 55 columns]
Feature engineering results logged in MLflow.
Attribute Combinations
import mlflow
import pandas as pd
import numpy as np
def ensure_numeric_columns(df, columns):
    """Coerce `columns` of `df` to numeric in place; unparseable values become NaN."""
    for name in columns:
        df[name] = pd.to_numeric(df[name], errors='coerce')
    return df

def add_attribute_combinations(df):
    """Return a copy of `df` with ratio/difference features derived from existing columns.

    New columns: energy_per_duration, distance_per_duration, charging_cost_per_kwh,
    soc_diff, temperature_adjusted_energy. Any inf/NaN produced by the divisions
    (e.g. zero duration) is replaced with 0.
    """
    combined = df.copy()
    source_cols = [
        'energy_consumed_kwh', 'duration_hours', 'distance_driven_km',
        'charging_cost_usd', 'soc_end_percent', 'soc_start_percent', 'temperature_c'
    ]
    # Coerce the source columns so the arithmetic below never hits object dtype.
    combined = ensure_numeric_columns(combined, source_cols)

    combined['energy_per_duration'] = combined['energy_consumed_kwh'] / combined['duration_hours']
    combined['distance_per_duration'] = combined['distance_driven_km'] / combined['duration_hours']
    combined['charging_cost_per_kwh'] = combined['charging_cost_usd'] / combined['energy_consumed_kwh']
    combined['soc_diff'] = combined['soc_end_percent'] - combined['soc_start_percent']
    combined['temperature_adjusted_energy'] = combined['energy_consumed_kwh'] / (1 + np.abs(combined['temperature_c']))

    # Normalise division artifacts: +/-inf -> NaN, then every NaN -> 0.
    combined.replace([np.inf, -np.inf], np.nan, inplace=True)
    combined.fillna(0, inplace=True)
    return combined
# Derive the combined features and record the experiment in the active MLflow run.
df_with_features = add_attribute_combinations(df)
# Log the names of the engineered features as a run parameter.
mlflow.log_param("attribute_combination_features", [
"energy_per_duration", "distance_per_duration", "charging_cost_per_kwh",
"soc_diff", "temperature_adjusted_energy"
])
# Column-count delta = number of new features created.
mlflow.log_metric("num_new_features", len(df_with_features.columns) - len(df.columns))
# Persist the augmented dataset and attach it to the run as an artifact.
df_with_features.to_csv("attribute_combined_dataset.csv", index=False)
mlflow.log_artifact("attribute_combined_dataset.csv")
# Display the updated DataFrame, then close the run opened earlier.
print(df_with_features.head())
mlflow.end_run()
print("Feature combinations logged in MLflow.")
user_id user_type vehicle_model vehicle_age_years \
0 User_1 Commuter BMW i3 2.0
1 User_2 Casual Driver Hyundai Kona 3.0
2 User_3 Commuter Chevy Bolt 2.0
3 User_4 Long-Distance Traveler Hyundai Kona 1.0
4 User_5 Long-Distance Traveler Hyundai Kona 1.0
station_id station_location charger_type start_time \
0 Station_391 Houston DC Fast Charger 2024-01-01 00:00:00
1 Station_428 San Francisco Level 1 2024-01-01 01:00:00
2 Station_181 San Francisco Level 2 2024-01-01 02:00:00
3 Station_327 Houston Level 1 2024-01-01 03:00:00
4 Station_108 Los Angeles Level 1 2024-01-01 04:00:00
end_time duration_hours ... time_of_day day_of_week \
0 2024-01-01 00:39:00 0.591363 ... Evening Tuesday
1 2024-01-01 03:01:00 3.133652 ... Morning Monday
2 2024-01-01 04:48:00 2.452653 ... Morning Thursday
3 2024-01-01 06:42:00 1.266431 ... Evening Saturday
4 2024-01-01 05:46:00 2.019765 ... Morning Saturday
distance_driven_km temperature_c battery_capacity_kwh \
0 293.602111 27.947953 108.463007
1 112.112804 14.311026 100.000000
2 71.799253 21.002002 75.000000
3 199.577785 38.316313 50.000000
4 203.661847 -7.834199 50.000000
energy_per_duration distance_per_duration charging_cost_per_kwh soc_diff \
0 102.665033 496.483377 0.215569 56.748386
1 3.937666 35.777043 1.712292 74.548566
2 7.799260 29.274121 1.864577 63.063011
3 62.741544 157.590753 0.164065 16.504325
4 9.718509 100.834423 0.517674 9.484836
temperature_adjusted_energy
0 2.097293
1 0.805908
2 0.869415
3 2.020989
4 2.221945
[5 rows x 25 columns]
๐ View run nervous-cat-853 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/968f58a1fba64340b8af9102ff518b81
๐งช View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Feature combinations logged in MLflow.
Variance Threshold, Correlation Threshold, Feature Importance
import pandas as pd
import numpy as np
import mlflow
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Assuming df is your dataset and 'target' is the target column
# Target label for the feature-selection experiments.
target = 'user_type'
# Open a named MLflow run (closed later via mlflow.end_run()).
mlflow.start_run(run_name="Feature_Selection_Experiment")
# Split dataset into features and target
X = df.drop(columns=[target])
y = df[target]
# Log parameters
mlflow.log_param("target_column", target)
# Partition columns by dtype: object/category -> categorical, rest -> numeric.
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X.select_dtypes(exclude=['object', 'category']).columns.tolist()
# Log categorical and numerical features
mlflow.log_param("categorical_features", categorical_features)
mlflow.log_param("numeric_features", numeric_features)
# Numeric columns are standardised; categoricals are one-hot encoded below.
numeric_transformer = StandardScaler()
# Convert categorical columns to string to avoid mixed types
# (elementwise DataFrame.map(str) — assumes pandas >= 2.1; TODO confirm)
X[categorical_features] = X[categorical_features].map(str)
# One-hot encoder with dense output; drop='first' removes one level per feature.
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
# Combine the transformers using ColumnTransformer
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
# Experiment 1: Feature selection using Correlation Threshold
def correlation_threshold(X, threshold=0.9):
    """Drop one column of every pair whose absolute correlation exceeds `threshold`.

    Accepts a DataFrame, ndarray, or sparse matrix; non-DataFrame input is
    converted (sparse input is densified) before computing correlations.
    Returns the reduced DataFrame.
    """
    if isinstance(X, np.ndarray):
        X = pd.DataFrame(X)
    elif hasattr(X, 'toarray'):  # sparse matrix -> dense DataFrame
        X = pd.DataFrame(X.toarray())

    abs_corr = X.corr().abs()
    # Keep only entries strictly above the diagonal so each pair is seen once.
    upper_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(upper_mask)
    redundant = [col for col in upper.columns if any(upper[col] > threshold)]
    return X.drop(columns=redundant)
# Experiment 1: preprocess, then drop highly correlated columns (|r| > 0.9).
X_processed = preprocessor.fit_transform(X)
X_corr_selected = correlation_threshold(pd.DataFrame(X_processed), threshold=0.9)
# Log Correlation Threshold results
mlflow.log_param("correlation_threshold", 0.9)
mlflow.log_metric("num_features_after_correlation_threshold", X_corr_selected.shape[1])
# Experiment 2: Feature selection using Feature Importance (Random Forest)
def feature_importance(X, y):
    """Reorder the columns of X by descending random-forest feature importance.

    NOTE(review): this only sorts columns — nothing is dropped, so the result
    has exactly as many features as the input.
    """
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X, y)
    descending = np.argsort(forest.feature_importances_)[::-1]
    return X.iloc[:, descending]
# Experiment 2: preprocess, then rank columns by random-forest importance.
X_fi_processed = preprocessor.fit_transform(X)
X_fi_selected = feature_importance(pd.DataFrame(X_fi_processed), y)
# Log Feature Importance results
# NOTE(review): feature_importance reorders columns but drops none, so this
# metric equals the full feature count.
mlflow.log_param("feature_importance_model", "Random Forest")
mlflow.log_metric("num_features_after_feature_importance", X_fi_selected.shape[1])
# Experiment 3: Feature selection using Variance Threshold
def variance_threshold(X, threshold=0.01):
    """Keep only columns whose variance exceeds `threshold`; returns an ndarray."""
    selector = VarianceThreshold(threshold=threshold)
    return selector.fit_transform(X)
# Experiment 3: preprocess, then drop near-constant columns (variance <= 0.01).
X_var_processed = preprocessor.fit_transform(X)
X_var_selected = variance_threshold(pd.DataFrame(X_var_processed), threshold=0.01)
# Log Variance Threshold results
mlflow.log_param("variance_threshold", 0.01)
mlflow.log_metric("num_features_after_variance_threshold", X_var_selected.shape[1])
# Train and evaluate models to validate the selected features
def evaluate_model(X_selected, y):
    """Hold-out accuracy of a fixed-seed random forest on the given feature matrix."""
    X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    clf = RandomForestClassifier(n_estimators=100, random_state=42)
    clf.fit(X_train, y_train)
    return accuracy_score(y_test, clf.predict(X_test))
# Validate each selection method by training/scoring the same model on its output.
accuracy_corr = evaluate_model(X_corr_selected, y)
accuracy_fi = evaluate_model(X_fi_selected, y)
accuracy_var = evaluate_model(X_var_selected, y)
# Log the model performance
mlflow.log_metric("accuracy_after_correlation_threshold", accuracy_corr)
mlflow.log_metric("accuracy_after_feature_importance", accuracy_fi)
mlflow.log_metric("accuracy_after_variance_threshold", accuracy_var)
# Close the run opened by mlflow.start_run above.
mlflow.end_run()
# Print summary
print(f"Accuracy after Correlation Threshold: {accuracy_corr:.4f}")
print(f"Accuracy after Feature Importance: {accuracy_fi:.4f}")
print(f"Accuracy after Variance Threshold: {accuracy_var:.4f}")
๐ View run Feature_Selection_Experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/73e3cabab4084f83b3d2ce84a656390e
๐งช View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Accuracy after Correlation Threshold: 0.2952
Accuracy after Feature Importance: 0.2952
Accuracy after Variance Threshold: 0.2996
Principal Component Analysis
import pandas as pd
import numpy as np
import mlflow
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
# PCA dimensionality-reduction experiment on the same dataset/target.
target = 'user_type'
# Open a named MLflow run (closed later via mlflow.end_run()).
mlflow.start_run(run_name="PCA_Dimensionality_Reduction")
# Split dataset into features and target
X = df.drop(columns=[target])
y = df[target]
# Log parameters
mlflow.log_param("target_column", target)
# Partition columns by dtype: object/category -> categorical, rest -> numeric.
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_features = X.select_dtypes(exclude=['object', 'category']).columns.tolist()
# Log categorical and numerical features
mlflow.log_param("categorical_features", categorical_features)
mlflow.log_param("numeric_features", numeric_features)
# Numeric columns are standardised; categoricals are one-hot encoded below.
numeric_transformer = StandardScaler()
# Convert categorical columns to string to avoid mixed types
# (DataFrame.applymap is deprecated in favour of DataFrame.map — see FutureWarning)
X[categorical_features] = X[categorical_features].applymap(str)
# One-hot encoder with dense output; drop='first' removes one level per feature.
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
# Combine the transformers using ColumnTransformer
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
])
# Apply preprocessing to the features
X_processed = preprocessor.fit_transform(X)
# Fit a full PCA (all components) to inspect the variance spectrum.
pca = PCA()
X_pca = pca.fit_transform(X_processed)
# Log the explained variance ratio
explained_variance_ratio = pca.explained_variance_ratio_
mlflow.log_param("explained_variance_ratio", explained_variance_ratio.tolist())
# Scree plot: explained variance per component, in component order.
plt.figure(figsize=(8, 6))
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o', linestyle='--')
plt.title("Scree Plot: Explained Variance Ratio per Principal Component")
plt.xlabel("Principal Components")
plt.ylabel("Explained Variance Ratio")
plt.grid(True)
plt.tight_layout()
# Save the plot to a file
scree_plot_path = "scree_plot.png"
plt.savefig(scree_plot_path)
# Show the plot in the output
plt.show()
# Attach the scree plot to the MLflow run.
mlflow.log_artifact(scree_plot_path)
# Cumulative explained variance determines how many components to keep.
cumulative_variance = np.cumsum(explained_variance_ratio)
mlflow.log_param("cumulative_variance", cumulative_variance.tolist())
# Smallest number of components whose cumulative variance reaches the threshold.
threshold = 0.95 # Choose the threshold for explained variance
num_components = np.argmax(cumulative_variance >= threshold) + 1
mlflow.log_param("num_components_selected", num_components)
# Refit PCA keeping only the selected number of components.
pca_selected = PCA(n_components=num_components)
X_pca_selected = pca_selected.fit_transform(X_processed)
# Log feature counts before/after the reduction.
mlflow.log_param("num_features_before_pca", X_processed.shape[1])
mlflow.log_param("num_features_after_pca", X_pca_selected.shape[1])
# Train and evaluate model on PCA-reduced features
def evaluate_model(X_selected, y):
    """Hold-out accuracy of a fixed-seed random forest on the given feature matrix."""
    split = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    X_train, X_test, y_train, y_test = split
    forest = RandomForestClassifier(n_estimators=100, random_state=42)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)
    return accuracy_score(y_test, predictions)
# Validate the PCA-reduced features with the same model used elsewhere.
accuracy_pca = evaluate_model(X_pca_selected, y)
# Log the model performance
mlflow.log_metric("accuracy_after_pca", accuracy_pca)
# Close the run opened by mlflow.start_run above.
mlflow.end_run()
# Print the results
print(f"Accuracy after PCA: {accuracy_pca:.4f}")
print(f"Number of components selected: {num_components}")
/var/folders/5q/38fn8x6x05j5m61tvnrj7t040000gn/T/ipykernel_13263/4008606804.py:38: FutureWarning: DataFrame.applymap has been deprecated. Use DataFrame.map instead.
X[categorical_features] = X[categorical_features].applymap(str)
๐ View run PCA_Dimensionality_Reduction at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/2fee16b32868472fa613e5d8eb7934b0
๐งช View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Accuracy after PCA: 0.3040
Number of components selected: 806
Custom Experiment 1
import mlflow
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
import numpy as np
with mlflow.start_run(nested=True):
    # NOTE(review): the param claims "SVM Classifier", but `model_pipeline` is
    # whichever pipeline was built last in the notebook — confirm an SVC
    # pipeline was actually assigned before this cell runs.
    mlflow.log_param("model", "SVM Classifier")
    mlflow.log_param("scaler", "StandardScaler")

    # 3-fold cross-validated macro-F1 on the training split.
    cv_results = cross_val_score(model_pipeline, X_train, y_train, cv=3, scoring='f1_macro')
    mlflow.log_metric("cv_f1_mean", cv_results.mean())
    mlflow.log_metric("cv_f1_std", cv_results.std())

    # Refit and score the held-out test split.
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    svm_f1_score = f1_score(y_test, y_pred, average='macro')
    # BUG FIX: the original logged and printed the stale `f1` left over from a
    # previous cell instead of the score computed here for this model.
    mlflow.log_metric("f1_test", svm_f1_score)

    # One-vs-rest confusion counts for a single chosen class.
    cm = confusion_matrix(y_test, y_pred)
    class_index = 0
    tp = cm[class_index, class_index]
    fn = cm[class_index].sum() - tp
    fp = cm[:, class_index].sum() - tp
    tn = cm.sum() - (tp + fn + fp)
    mlflow.log_metric(f"TP_class_{class_index}", tp)
    mlflow.log_metric(f"TN_class_{class_index}", tn)
    mlflow.log_metric(f"FP_class_{class_index}", fp)
    mlflow.log_metric(f"FN_class_{class_index}", fn)

    # Persist the fitted pipeline with the run.
    mlflow.sklearn.log_model(model_pipeline, "svm_model")

    print(f"Logged F1 score to MLFlow: {svm_f1_score:.4f}")
    print(f"Logged CV results (mean): {cv_results.mean():.4f}")
    print(f"Logged CV results (std): {cv_results.std():.4f}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"True Positives (Class {class_index}): {tp}")
    print(f"True Negatives (Class {class_index}): {tn}")
    print(f"False Positives (Class {class_index}): {fp}")
    print(f"False Negatives (Class {class_index}): {fn}")
2024/12/21 09:59:08 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Logged F1 score to MLFlow: 0.2980
Logged CV results (mean): 0.3147
Logged CV results (std): 0.0145
Confusion Matrix:
[[12 68 30]
[13 76 19]
[15 79 28]]
True Positives (Class 0): 12
True Negatives (Class 0): 202
False Positives (Class 0): 28
False Negatives (Class 0): 98
🏃 View run debonair-robin-995 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/291de19f1d7b44909c5ea00c2c0c8a28
🧪 View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
Custom Experiment 2
import mlflow
import mlflow.sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# --- Preprocess: median-impute and standardize the numeric columns ---
numerical_columns = df.select_dtypes(include=["float64", "int64"]).columns
imputer = SimpleImputer(strategy="median")
df[numerical_columns] = imputer.fit_transform(df[numerical_columns])

# NOTE(review): the scaler is fit on the full dataset before the train/val
# split, which leaks validation statistics into training — acceptable for
# exploratory plots, but for an honest benchmark fit on X_train only.
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df[numerical_columns]), columns=numerical_columns)

# Encode charger type so it can drive the scatter-plot colouring below.
label_encoder = LabelEncoder()
df['charger_type_encoded'] = label_encoder.fit_transform(df['charger_type'])

# 2-D t-SNE embedding of the scaled numeric features (seeded for repeatability).
tsne = TSNE(n_components=2, random_state=42)
df_tsne = tsne.fit_transform(df_scaled)
plt.figure(figsize=(8, 6))
plt.scatter(df_tsne[:, 0], df_tsne[:, 1], c=df['charger_type_encoded'], cmap='viridis', s=50, alpha=0.7)
plt.title('t-SNE visualization of the data')
# BUG FIX: the points are coloured by charger type (see `c=` above), not by
# user type, so label the colourbar accordingly.
plt.colorbar(label='charger_type')
plt.show()

# Features/target and an 80/20 train/validation split for the KNN grid search.
X = df_scaled
y = df['user_type']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'metric': ['euclidean', 'manhattan'],
}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(knn, param_grid, cv=5, n_jobs=-1, scoring='accuracy')
# Grid-search KNN hyperparameters and log the winning model to MLflow.
with mlflow.start_run():
    mlflow.log_param("cv_folds", 5)
    mlflow.log_param("param_grid", param_grid)
    # 5-fold, accuracy-scored search over neighbour count and distance metric.
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    best_knn = grid_search.best_estimator_
    mlflow.log_params(best_params)
    # Evaluate the refit best estimator on the held-out validation split.
    y_pred = best_knn.predict(X_val)
    cm = confusion_matrix(y_val, y_pred)
    report = classification_report(y_val, y_pred, output_dict=True)
    mlflow.log_metric("accuracy", report["accuracy"])
    mlflow.log_metric("macro_avg_f1", report["macro avg"]["f1-score"])
    mlflow.log_metric("weighted_avg_f1", report["weighted avg"]["f1-score"])
    mlflow.log_metric("precision", report["macro avg"]["precision"])
    mlflow.log_metric("recall", report["macro avg"]["recall"])
    # Render the confusion matrix, save it to disk, and attach it to the run.
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=np.unique(y), yticklabels=np.unique(y))
    plt.xlabel("Predicted Labels")
    plt.ylabel("True Labels")
    plt.title("Confusion Matrix")
    plt.savefig("confusion_matrix.png")
    mlflow.log_artifact("confusion_matrix.png")
    mlflow.sklearn.log_model(best_knn, "knn_model")
    print(f"Best Parameters: {best_params}")
    print(f"Confusion Matrix:\n{cm}")
    print(f"Classification Report:\n{classification_report(y_val, y_pred)}")
2024/12/21 09:59:27 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.
Best Parameters: {'metric': 'euclidean', 'n_neighbors': 5}
Confusion Matrix:
[[29 33 16]
[33 25 12]
[37 30 12]]
Classification Report:
precision recall f1-score support
Casual Driver 0.29 0.37 0.33 78
Commuter 0.28 0.36 0.32 70
Long-Distance Traveler 0.30 0.15 0.20 79
accuracy 0.29 227
macro avg 0.29 0.29 0.28 227
weighted avg 0.29 0.29 0.28 227
🏃 View run unruly-ox-159 at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0/runs/78969f4e42d645789833917c9642a5ed
🧪 View experiment at: https://dagshub.com/saisatvikh/final_repo.mlflow/#/experiments/0
F1-Scores Comparison
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns

# Compare the macro F1 scores collected from each model's experiment cell,
# best model at the top of the horizontal bar chart.
model_names = ['Logistic Regression', 'Random Forest', 'Ridge Classifier', 'SVM']
f1_scores = [logistic_f1_score, random_forest_f1, ridge_f1_score, svm_f1_score]
f1_scores_df = pd.DataFrame({
    'Model': model_names,
    'F1-Score': f1_scores
})
f1_df = f1_scores_df.sort_values(by='F1-Score', ascending=False)

plt.figure(figsize=(10, 6))
# FIX: seaborn deprecates `palette` without `hue` (FutureWarning, removed in
# v0.14). Assign hue to the y variable and suppress the redundant legend to
# keep the same per-bar colouring.
sns.barplot(x='F1-Score', y='Model', hue='Model', data=f1_df, palette='viridis', legend=False)
plt.title('Model Comparison Based on F1-Score', fontsize=16)
plt.xlabel('F1-Score', fontsize=14)
plt.ylabel('Model', fontsize=14)
plt.show()
/var/folders/5q/38fn8x6x05j5m61tvnrj7t040000gn/T/ipykernel_13263/261019361.py:17: FutureWarning:
Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.
sns.barplot(x='F1-Score', y='Model', data=f1_df, palette='viridis')